# Libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## 
## The following object is masked from 'package:dplyr':
## 
##     collapse
## 
## This is mgcv 1.9-3. For overview type 'help("mgcv-package")'.
# Import the data subsets
RHP_RHH_df <- read_csv("RHP_RHH_bip.csv")
## Rows: 144820 Columns: 42
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (15): gameid, pitcher, pitcherthrows, batter, batterside, pitchresult, ...
## dbl  (26): ab, pitchnum, inning, teambat, balls, strikes, outs, visscore, ho...
## time  (1): GameDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
RHP_LHH_df <- read_csv("RHP_LHH_bip.csv")
## Rows: 123442 Columns: 42
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (15): gameid, pitcher, pitcherthrows, batter, batterside, pitchresult, ...
## dbl  (26): ab, pitchnum, inning, teambat, balls, strikes, outs, visscore, ho...
## time  (1): GameDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
LHP_RHH_df <- read_csv("LHP_RHH_bip.csv")
## Rows: 72971 Columns: 42
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (15): gameid, pitcher, pitcherthrows, batter, batterside, pitchresult, ...
## dbl  (26): ab, pitchnum, inning, teambat, balls, strikes, outs, visscore, ho...
## time  (1): GameDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
LHP_LHH_df <- read_csv("LHP_LHH_bip.csv")
## Rows: 26371 Columns: 42
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (15): gameid, pitcher, pitcherthrows, batter, batterside, pitchresult, ...
## dbl  (26): ab, pitchnum, inning, teambat, balls, strikes, outs, visscore, ho...
## time  (1): GameDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
GAM_RR <- gam(GIDP_SweetSpot ~
           s(spinrate) +
           s(relspeed) +
           s(inducedvertbreak) +
           s(horzbreak) +
           ti(platelocside, platelocheight) +
           s(platelocside) + s(platelocheight) +
           ti(initposx, initposz) +
           s(initposx) + s(initposz), 
         data=RHP_RHH_df,
         family = binomial,
         method = "REML")

summary(GAM_RR)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## GIDP_SweetSpot ~ s(spinrate) + s(relspeed) + s(inducedvertbreak) + 
##     s(horzbreak) + ti(platelocside, platelocheight) + s(platelocside) + 
##     s(platelocheight) + ti(initposx, initposz) + s(initposx) + 
##     s(initposz)
## 
## Parametric coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.655941   0.008127  -203.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                   edf Ref.df   Chi.sq p-value    
## s(spinrate)                     3.568  4.577   15.083 0.00728 ** 
## s(relspeed)                     4.256  5.362  245.481 < 2e-16 ***
## s(inducedvertbreak)             6.208  7.404  798.282 < 2e-16 ***
## s(horzbreak)                    4.645  5.784  195.252 < 2e-16 ***
## ti(platelocside,platelocheight) 8.861 10.144  161.537 < 2e-16 ***
## s(platelocside)                 7.654  8.543  385.655 < 2e-16 ***
## s(platelocheight)               3.436  4.373 1002.230 < 2e-16 ***
## ti(initposx,initposz)           2.349  2.862   10.512 0.01578 *  
## s(initposx)                     1.159  1.302    1.152 0.42889    
## s(initposz)                     1.116  1.211   32.486 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0274   Deviance explained = 3.15%
## -REML =  64042  Scale est. = 1         n = 144820
GAM_RL <- gam(GIDP_SweetSpot ~
           s(spinrate) +
           s(relspeed) +
           s(inducedvertbreak) +
           s(horzbreak) +
           ti(platelocside, platelocheight) +
           s(platelocside) + s(platelocheight) +
           ti(initposx, initposz) +
           s(initposx) + s(initposz), 
         data=RHP_LHH_df,
         family = binomial,
         method = "REML")

summary(GAM_RL)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## GIDP_SweetSpot ~ s(spinrate) + s(relspeed) + s(inducedvertbreak) + 
##     s(horzbreak) + ti(platelocside, platelocheight) + s(platelocside) + 
##     s(platelocheight) + ti(initposx, initposz) + s(initposx) + 
##     s(initposz)
## 
## Parametric coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.795205   0.009352    -192   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                   edf Ref.df  Chi.sq  p-value    
## s(spinrate)                     5.244  6.489  60.653  < 2e-16 ***
## s(relspeed)                     5.667  6.897 540.268  < 2e-16 ***
## s(inducedvertbreak)             5.626  6.831 589.107  < 2e-16 ***
## s(horzbreak)                    7.674  8.575 152.750  < 2e-16 ***
## ti(platelocside,platelocheight) 6.896  8.348  60.948  < 2e-16 ***
## s(platelocside)                 6.087  7.290 880.565  < 2e-16 ***
## s(platelocheight)               3.751  4.737 482.716  < 2e-16 ***
## ti(initposx,initposz)           5.219  6.476  29.635 5.62e-05 ***
## s(initposx)                     1.904  2.416  40.836  < 2e-16 ***
## s(initposz)                     1.837  2.330   7.551   0.0469 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0256   Deviance explained = 3.09%
## -REML =  50753  Scale est. = 1         n = 123442
GAM_LR <- gam(GIDP_SweetSpot ~
           s(spinrate) +
           s(relspeed) +
           s(inducedvertbreak) +
           s(horzbreak) +
           ti(platelocside, platelocheight) +
           s(platelocside) + s(platelocheight) +
           ti(initposx, initposz) +
           s(initposx) + s(initposz), 
         data=LHP_RHH_df,
         family = binomial,
         method = "REML")

summary(GAM_LR)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## GIDP_SweetSpot ~ s(spinrate) + s(relspeed) + s(inducedvertbreak) + 
##     s(horzbreak) + ti(platelocside, platelocheight) + s(platelocside) + 
##     s(platelocheight) + ti(initposx, initposz) + s(initposx) + 
##     s(initposz)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.74453    0.01163    -150   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                   edf Ref.df  Chi.sq p-value    
## s(spinrate)                     3.927  4.990  47.055 < 2e-16 ***
## s(relspeed)                     4.083  5.143 199.055 < 2e-16 ***
## s(inducedvertbreak)             5.274  6.384 312.427 < 2e-16 ***
## s(horzbreak)                    7.191  8.299  31.473 0.00014 ***
## ti(platelocside,platelocheight) 7.417  8.789  55.343 < 2e-16 ***
## s(platelocside)                 6.552  7.715 274.461 < 2e-16 ***
## s(platelocheight)               2.931  3.732 314.141 < 2e-16 ***
## ti(initposx,initposz)           1.746  2.107   1.269 0.50887    
## s(initposx)                     1.003  1.007   7.049 0.00807 ** 
## s(initposz)                     1.851  2.346   4.213 0.12602    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0226   Deviance explained =  2.7%
## -REML =  30984  Scale est. = 1         n = 72971
GAM_LL <- gam(GIDP_SweetSpot ~
           s(spinrate) +
           s(relspeed) +
           s(inducedvertbreak) +
           s(horzbreak) +
           ti(platelocside, platelocheight) +
           s(platelocside) + s(platelocheight) +
           ti(initposx, initposz) +
           s(initposx) + s(initposz), 
         data=LHP_LHH_df,
         family = binomial,
         method = "REML")

summary(GAM_LL)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## GIDP_SweetSpot ~ s(spinrate) + s(relspeed) + s(inducedvertbreak) + 
##     s(horzbreak) + ti(platelocside, platelocheight) + s(platelocside) + 
##     s(platelocheight) + ti(initposx, initposz) + s(initposx) + 
##     s(initposz)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.6759     0.0193  -86.83   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                                   edf Ref.df  Chi.sq  p-value    
## s(spinrate)                     1.006  1.012   1.236    0.268    
## s(relspeed)                     3.174  4.070  33.464 6.82e-07 ***
## s(inducedvertbreak)             5.494  6.678 161.309  < 2e-16 ***
## s(horzbreak)                    5.452  6.660  69.627  < 2e-16 ***
## ti(platelocside,platelocheight) 5.045  6.260  37.916  < 2e-16 ***
## s(platelocside)                 2.823  3.602  83.465  < 2e-16 ***
## s(platelocheight)               1.002  1.004 106.452  < 2e-16 ***
## ti(initposx,initposz)           1.005  1.010   0.074    0.797    
## s(initposx)                     1.002  1.004   1.322    0.251    
## s(initposz)                     1.002  1.003  19.029 1.27e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.024   Deviance explained = 2.95%
## -REML =  11464  Scale est. = 1         n = 26371
GAM_RR$sp
##                      s(spinrate)                      s(relspeed) 
##                     7.758323e+00                     2.268960e+00 
##              s(inducedvertbreak)                     s(horzbreak) 
##                     1.036654e+00                     3.942775e+00 
## ti(platelocside,platelocheight)1 ti(platelocside,platelocheight)2 
##                     1.300854e-01                     6.901210e+00 
##                  s(platelocside)                s(platelocheight) 
##                     2.422814e-01                     1.213659e+01 
##           ti(initposx,initposz)1           ti(initposx,initposz)2 
##                     1.778650e+04                     4.819995e+01 
##                      s(initposx)                      s(initposz) 
##                     6.083816e+02                     2.060975e+02
GAM_RL$sp
##                      s(spinrate)                      s(relspeed) 
##                        1.4862810                        0.8882628 
##              s(inducedvertbreak)                     s(horzbreak) 
##                        1.2006682                        0.1771016 
## ti(platelocside,platelocheight)1 ti(platelocside,platelocheight)2 
##                        3.1108314                        4.4345738 
##                  s(platelocside)                s(platelocheight) 
##                        1.0184365                        7.0181416 
##           ti(initposx,initposz)1           ti(initposx,initposz)2 
##                        6.9994421                        7.7315587 
##                      s(initposx)                      s(initposz) 
##                       35.3366361                        9.9384377
GAM_LR$sp
##                      s(spinrate)                      s(relspeed) 
##                     3.543508e+00                     1.390835e+00 
##              s(inducedvertbreak)                     s(horzbreak) 
##                     1.168677e+00                     8.071295e-02 
## ti(platelocside,platelocheight)1 ti(platelocside,platelocheight)2 
##                     3.673251e-02                     9.945469e+00 
##                  s(platelocside)                s(platelocheight) 
##                     3.034135e-01                     1.432097e+01 
##           ti(initposx,initposz)1           ti(initposx,initposz)2 
##                     2.426045e+05                     6.579429e+01 
##                      s(initposx)                      s(initposz) 
##                     1.810714e+04                     2.282283e+01
GAM_LL$sp
##                      s(spinrate)                      s(relspeed) 
##                     2.322189e+03                     2.485375e+00 
##              s(inducedvertbreak)                     s(horzbreak) 
##                     1.289867e-01                     2.700248e-01 
## ti(platelocside,platelocheight)1 ti(platelocside,platelocheight)2 
##                     3.448309e-02                     1.657037e+01 
##                  s(platelocside)                s(platelocheight) 
##                     3.630099e+00                     7.839531e+03 
##           ti(initposx,initposz)1           ti(initposx,initposz)2 
##                     4.186264e+03                     1.733346e+05 
##                      s(initposx)                      s(initposz) 
##                     1.581901e+04                     8.643324e+03
plot(GAM_RR)

Visualizations

library(ggplot2)
library(RColorBrewer)
library(rlang)
## 
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
## 
##     %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
##     flatten_raw, invoke, splice
plot_gam_heatmap <- function(model, xvar, zvar, data, n = 200) {
  # Build a dense grid
  grid <- expand.grid(
    x = seq(min(data[[xvar]], na.rm = TRUE),
            max(data[[xvar]], na.rm = TRUE),
            length.out = n),
    z = seq(min(data[[zvar]], na.rm = TRUE),
            max(data[[zvar]], na.rm = TRUE),
            length.out = n)
  )
  names(grid) <- c(xvar, zvar)

  # Fill in other predictors
  other_vars <- setdiff(names(data), c(xvar, zvar))
  for (v in other_vars) {
    if (is.numeric(data[[v]])) {
      grid[[v]] <- mean(data[[v]], na.rm = TRUE)
    } else {
      grid[[v]] <- unique(data[[v]])[1]
    }
  }

  # Predict
  grid$fit <- predict(model, newdata = grid, type = "response")

  # Explicitly map fill inside geom_tile()
  ggplot(grid, aes(x = !!sym(xvar), y = !!sym(zvar))) +
    geom_tile(aes(fill = fit)) +
    geom_contour(aes(z = fit), color = "black", linewidth = 0.3) +
    scale_fill_gradientn(colors = rev(brewer.pal(11, "RdYlBu"))) +
    coord_equal() +
    labs(
      x = xvar,
      y = zvar,
      fill = "Predicted Probability",
      title = paste("Predicted Sweet-Spot Probability by", xvar, "and", zvar)
    ) +
    theme_minimal(base_size = 14)
}



plot_gam_heatmap(GAM_LL, "platelocside", "platelocheight", LHP_LHH_df)

plot_gam_heatmap(GAM_RR, "platelocside", "platelocheight", RHP_RHH_df)

plot_gam_heatmap(GAM_RL, "platelocside", "platelocheight", RHP_LHH_df)

plot_gam_heatmap(GAM_LR, "platelocside", "platelocheight", LHP_RHH_df)

Visualizations

plot_gam_heatmap_kzone <- function(model, data, 
                                xvar = "platelocside", 
                                zvar = "platelocheight",
                                n = 200,
                                side_range = c(-0.83, 0.83),
                                height_range = c(1.5, 3.5)) {
  # grid only within the strike zone
  grid <- expand.grid(
    x = seq(side_range[1], side_range[2], length.out = n),
    z = seq(height_range[1], height_range[2], length.out = n)
  )
  names(grid) <- c(xvar, zvar)

  # keep other predictors fixed
  other_vars <- setdiff(names(data), c(xvar, zvar))
  for (v in other_vars) {
    if (is.numeric(data[[v]])) {
      grid[[v]] <- mean(data[[v]], na.rm = TRUE)
    } else {
      grid[[v]] <- unique(data[[v]])[1]
    }
  }

  # predict only inside strike zone
  grid$fit <- predict(model, newdata = grid, type = "response")

  ggplot(grid, aes(x = !!sym(xvar), y = !!sym(zvar))) +
    geom_tile(aes(fill = fit)) +
    geom_contour(aes(z = fit), color = "black", linewidth = 0.3) +
    scale_fill_gradientn(colors = rev(brewer.pal(11, "RdYlBu"))) +
    coord_equal() +
    scale_y_continuous(limits = c(1.5, 3.5)) +   # normal (not reversed)
    geom_rect(aes(xmin = -0.83, xmax = 0.83, ymin = 1.5, ymax = 3.5),
              color = "white", fill = NA, linewidth = 1) +
    labs(
      x = "Plate Side (ft)",
      y = "Plate Height (ft)",
      fill = "Predicted Probability",
      title = "Predicted Sweet-Spot Probability within Strike Zone"
    ) +
    theme_minimal(base_size = 14)
}

plot_gam_heatmap_kzone(GAM_LL, LHP_LHH_df)
## Warning: Removed 400 rows containing missing values or values outside the scale range
## (`geom_tile()`).

plot_gam_heatmap_kzone(GAM_RR, RHP_RHH_df)
## Warning: Removed 400 rows containing missing values or values outside the scale range
## (`geom_tile()`).

plot_gam_heatmap_kzone(GAM_RL, RHP_LHH_df)
## Warning: Removed 400 rows containing missing values or values outside the scale range
## (`geom_tile()`).

plot_gam_heatmap_kzone(GAM_LR, LHP_RHH_df)
## Warning: Removed 400 rows containing missing values or values outside the scale range
## (`geom_tile()`).

library(gratia)
## 
## Attaching package: 'gratia'
## The following object is masked from 'package:stringr':
## 
##     boundary
draw(GAM_LL, select = "s(relspeed)", residuals = TRUE)

draw(GAM_LL, select = "s(spinrate)", residuals = TRUE)

draw(GAM_LL, select = "s(inducedvertbreak)", residuals = TRUE)

draw(GAM_LL, select = "s(horzbreak)", residuals = TRUE)

Pitch Type Analysis

Each pitch is assigned to a cluster (0-4) based on physical pitch traits

RHP_RHH_by_cluster <- read_csv('RHP_RHH_GIDP_ByPitchCluster.csv')
## Rows: 144820 Columns: 43
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (15): gameid, pitcher, pitcherthrows, batter, batterside, pitchresult, ...
## dbl  (27): ab, pitchnum, inning, teambat, balls, strikes, outs, visscore, ho...
## time  (1): GameDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
RHP_LHH_by_cluster <- read_csv('RHP_LHH_GIDP_ByPitchCluster.csv')
## Rows: 123442 Columns: 43
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (15): gameid, pitcher, pitcherthrows, batter, batterside, pitchresult, ...
## dbl  (27): ab, pitchnum, inning, teambat, balls, strikes, outs, visscore, ho...
## time  (1): GameDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
LHP_RHH_by_cluster <- read_csv('LHP_RHH_GIDP_ByPitchCluster.csv')
## Rows: 72971 Columns: 43
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (15): gameid, pitcher, pitcherthrows, batter, batterside, pitchresult, ...
## dbl  (27): ab, pitchnum, inning, teambat, balls, strikes, outs, visscore, ho...
## time  (1): GameDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
LHP_LHH_by_cluster <- read_csv('LHP_LHH_GIDP_ByPitchCluster.csv')
## Rows: 26371 Columns: 43
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (15): gameid, pitcher, pitcherthrows, batter, batterside, pitchresult, ...
## dbl  (27): ab, pitchnum, inning, teambat, balls, strikes, outs, visscore, ho...
## time  (1): GameDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Inference by pitch type + location

Hold pitch traits and release point constant, while changing pitch location.

# Average pitch type traits
cluster_summary_rr <- RHP_RHH_by_cluster %>%
  group_by(Cluster) %>%
  summarise(
    spinrate = mean(spinrate, na.rm = TRUE),
    relspeed = mean(relspeed, na.rm = TRUE),
    inducedvertbreak = mean(inducedvertbreak, na.rm = TRUE),
    horzbreak = mean(horzbreak, na.rm = TRUE),
    initposx = mean(initposx, na.rm = TRUE),
    initposz = mean(initposz, na.rm = TRUE)
  )

# strikezone
k_zone_height_max <- 3.67
k_zone_heightmin <- 1.52
sides <- c(-0.83, 0.83)

plot_heatmap_per_pitchtype <- function(model, data,
                                       pitch_cluster, title) {
  k_zone_height_max <- 3.67
  k_zone_height_min <- 1.52
  sides <- c(-0.83, 0.83)
  
  cluster_summary_rr <- data %>%
    filter(Cluster == pitch_cluster) %>%
    summarise(
      spinrate = mean(spinrate, na.rm = TRUE),
      relspeed = mean(relspeed, na.rm = TRUE),
      inducedvertbreak = mean(inducedvertbreak, na.rm = TRUE),
      horzbreak = mean(horzbreak, na.rm = TRUE),
      initposx = mean(initposx, na.rm = TRUE),
      initposz = mean(initposz, na.rm = TRUE)
    )
  
  grid <- expand.grid(
    platelocside = seq(sides[1], sides[2], length.out = 200),
    platelocheight = seq(k_zone_height_min, k_zone_height_max, length.out = 200)
  )
  
  grid$spinrate         <- cluster_summary_rr$spinrate
  grid$relspeed         <- cluster_summary_rr$relspeed
  grid$inducedvertbreak <- cluster_summary_rr$inducedvertbreak
  grid$horzbreak        <- cluster_summary_rr$horzbreak
  grid$initposx         <- cluster_summary_rr$initposx
  grid$initposz         <- cluster_summary_rr$initposz
  grid$Cluster          <- pitch_cluster
  
  grid$pred_prob <- predict(model, newdata = grid, type = "response")
  
  p <- ggplot(grid, aes(platelocside, platelocheight, fill = pred_prob)) +
    geom_tile() +
    scale_fill_viridis_c(option = "C", direction = -1, limits = c(0, 0.5)) +
    geom_rect(
      aes(xmin = sides[1], xmax = sides[2],
          ymin = k_zone_height_min, ymax = k_zone_height_max),
      color = "white", fill = NA, linewidth = 0.8
    ) +
    coord_equal() +
    labs(
      title = paste(title, "Predicted Sweet-Spot Probability — Pitch Type", pitch_cluster),
      x = "Plate Side (ft)",
      y = "Plate Height (ft)",
      fill = "Predicted Prob."
    ) +
    theme_minimal(base_size = 14)
  
  return(p)
}

unique(RHP_RHH_by_cluster$Cluster)
## [1] 1 0 3 4 2
# RHP vs RHH
plot_heatmap_per_pitchtype(GAM_RR, RHP_RHH_by_cluster, 0, "RHP vs RHH")

plot_heatmap_per_pitchtype(GAM_RR, RHP_RHH_by_cluster, 1, "RHP vs RHH")

plot_heatmap_per_pitchtype(GAM_RR, RHP_RHH_by_cluster, 2, "RHP vs RHH")

plot_heatmap_per_pitchtype(GAM_RR, RHP_RHH_by_cluster, 3, "RHP vs RHH")

plot_heatmap_per_pitchtype(GAM_RR, RHP_RHH_by_cluster, 4, "RHP vs RHH")

# RHP vs LHH
plot_heatmap_per_pitchtype(GAM_RL, RHP_LHH_by_cluster, 0, "RHP vs LHH")

plot_heatmap_per_pitchtype(GAM_RL, RHP_LHH_by_cluster, 1, "RHP vs LHH")

plot_heatmap_per_pitchtype(GAM_RL, RHP_LHH_by_cluster, 2, "RHP vs LHH")

plot_heatmap_per_pitchtype(GAM_RL, RHP_LHH_by_cluster, 3, "RHP vs LHH")

plot_heatmap_per_pitchtype(GAM_RL, RHP_LHH_by_cluster, 4, "RHP vs LHH")

# LHP vs RHH
plot_heatmap_per_pitchtype(GAM_LR, LHP_RHH_by_cluster, 0, "LHP vs RHH")

plot_heatmap_per_pitchtype(GAM_LR, LHP_RHH_by_cluster, 1, "LHP vs RHH")

plot_heatmap_per_pitchtype(GAM_LR, LHP_RHH_by_cluster, 2, "LHP vs RHH")

plot_heatmap_per_pitchtype(GAM_LR, LHP_RHH_by_cluster, 3, "LHP vs RHH")

plot_heatmap_per_pitchtype(GAM_LR, LHP_RHH_by_cluster, 4, "LHP vs RHH")

# LHP vs LHH
plot_heatmap_per_pitchtype(GAM_LL, LHP_LHH_by_cluster, 0, "LHP vs LHH")

plot_heatmap_per_pitchtype(GAM_LL, LHP_LHH_by_cluster, 1, "LHP vs LHH")

plot_heatmap_per_pitchtype(GAM_LL, LHP_LHH_by_cluster, 2, "LHP vs LHH")

plot_heatmap_per_pitchtype(GAM_LL, LHP_LHH_by_cluster, 3, "LHP vs LHH")

plot_heatmap_per_pitchtype(GAM_LL, LHP_LHH_by_cluster, 4, "LHP vs LHH")

Doing Zone-based visualizations

plot_zone_per_pitchtype_RHP <- function(model, data, pitch_cluster, title = NULL) {
  # Strike zone bounds
  k_zone_height_max <- 3.67
  k_zone_height_min <- 1.52
  sides <- c(-0.83, 0.83)
  
  # Mean cluster parameters
  cluster_summary_rr <- data %>%
    filter(Cluster == pitch_cluster) %>%
    summarise(
      spinrate = mean(spinrate, na.rm = TRUE),
      relspeed = mean(relspeed, na.rm = TRUE),
      inducedvertbreak = mean(inducedvertbreak, na.rm = TRUE),
      horzbreak = mean(horzbreak, na.rm = TRUE),
      initposx = mean(initposx, na.rm = TRUE),
      initposz = mean(initposz, na.rm = TRUE)
    )
  
  # Grid with small buffer outside zone
  grid <- expand.grid(
    platelocside = seq(sides[1] - 0.5, sides[2] + 0.5, length.out = 300),
    platelocheight = seq(k_zone_height_min - 0.5, k_zone_height_max + 0.5, length.out = 300)
  )
  
  # Add mean parameters
  for (col in names(cluster_summary_rr)) {
    grid[[col]] <- cluster_summary_rr[[col]]
  }
  grid$Cluster <- pitch_cluster
  
  # Predict probabilities
  grid$pred_prob <- predict(model, newdata = grid, type = "response")
  
  # --- Define 25 zones (5×5 grid including chase zones)
  # NOTE: length(breaks) = length(labels) + 1, otherwise NA
  grid <- grid %>%
    mutate(
      zone_col = cut(
        platelocside,
        breaks = seq(sides[1] - 0.5, sides[2] + 0.5, length.out = 6),
        labels = c("Far Outside", "Middle-Out", "Middle", "Middle-In", "Inside"),
        include.lowest = TRUE
      ),
      zone_row = cut(
        platelocheight,
        breaks = seq(k_zone_height_min - 0.5, k_zone_height_max + 0.5,
                     length.out = 6),
        labels = c("Low-Out", "Low", "Mid", "High", "High-Out"),
        include.lowest = TRUE
      )
    )
  
  # Average within zones
  zone_summary <- grid %>%
    group_by(zone_row, zone_col) %>%
    summarise(mean_prob = mean(pred_prob, na.rm = TRUE), .groups = "drop")
  
  # Reorder for baseball orientation (High top, Inside right)
  zone_summary$zone_row <- factor(zone_summary$zone_row,
                                  levels = c("High-Out", "High", "Mid", "Low", "Low-Out"))
  zone_summary$zone_col <- factor(zone_summary$zone_col,
                                  levels = c("Far Outside", "Middle-Out", "Middle", "Middle-In", "Inside"))
  
  zone_summary$zone_col <- factor(zone_summary$zone_col, levels = rev(levels(zone_summary$zone_col)))  # inside → right
  zone_summary$zone_row <- factor(zone_summary$zone_row, levels = rev(levels(zone_summary$zone_row)))  # high → top
  
  p <- ggplot(zone_summary, aes(x = rev(zone_col), y = zone_row, fill = mean_prob)) +
    geom_tile(color = "white", linewidth = 1) +
    geom_text(aes(label = sprintf("%.2f", mean_prob)), color = "white", size = 4.2) +
    scale_fill_viridis_c(option = "C", direction = -1, limits = c(0, 0.5)) +
    labs(
      title = paste0(title, " Predicted Sweet-Spot Probability\n Pitch Type ", pitch_cluster),
      x = "Horizontal Zone (Catcher View)",
      y = "Vertical Zone",
      fill = "Pred. Prob."
    ) +
    coord_equal() +
    scale_x_discrete(position = "top") +
    theme_minimal(base_size = 14) + 
    theme(
      panel.grid = element_blank(),
      axis.text.x = element_text(angle = 0, hjust = 0.5, size=10) ,
      plot.title = element_text(size=15, hjust=0.5)
    ) + 
    geom_rect(
    aes(xmin = as.numeric(factor("Middle-In", levels = levels(zone_summary$zone_col))) - 0.5,
    xmax = as.numeric(factor("Middle-Out", levels = levels(zone_summary$zone_col))) + 0.5,
    ymin = as.numeric(factor("Low", levels = levels(zone_summary$zone_row))) - 0.5,
    ymax = as.numeric(factor("High", levels = levels(zone_summary$zone_row))) + 0.5),
    color = "black", fill = NA, linewidth = 1.5, inherit.aes = FALSE
  )

    return(p)
}

plot_zone_per_pitchtype_RHP(GAM_RR, RHP_RHH_by_cluster, 0, "RHP vs RHH")
## Warning: Use of `zone_summary$zone_col` is discouraged.
## ℹ Use `zone_col` instead.
## Use of `zone_summary$zone_col` is discouraged.
## ℹ Use `zone_col` instead.
## Warning: Use of `zone_summary$zone_row` is discouraged.
## ℹ Use `zone_row` instead.
## Use of `zone_summary$zone_row` is discouraged.
## ℹ Use `zone_row` instead.
## Warning in geom_rect(aes(xmin = as.numeric(factor("Middle-In", levels = levels(zone_summary$zone_col))) - : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

plot_zone_per_pitchtype_RHP(GAM_RR, RHP_RHH_by_cluster, 1, "RHP vs RHH")
## Warning: Use of `zone_summary$zone_col` is discouraged.
## ℹ Use `zone_col` instead.
## Warning: Use of `zone_summary$zone_col` is discouraged.
## ℹ Use `zone_col` instead.
## Warning: Use of `zone_summary$zone_row` is discouraged.
## ℹ Use `zone_row` instead.
## Use of `zone_summary$zone_row` is discouraged.
## ℹ Use `zone_row` instead.
## Warning in geom_rect(aes(xmin = as.numeric(factor("Middle-In", levels = levels(zone_summary$zone_col))) - : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

plot_zone_per_pitchtype_RHP(GAM_RR, RHP_RHH_by_cluster, 2, "RHP vs RHH")
## Warning: Use of `zone_summary$zone_col` is discouraged.
## ℹ Use `zone_col` instead.
## Warning: Use of `zone_summary$zone_col` is discouraged.
## ℹ Use `zone_col` instead.
## Warning: Use of `zone_summary$zone_row` is discouraged.
## ℹ Use `zone_row` instead.
## Use of `zone_summary$zone_row` is discouraged.
## ℹ Use `zone_row` instead.
## Warning in geom_rect(aes(xmin = as.numeric(factor("Middle-In", levels = levels(zone_summary$zone_col))) - : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

plot_zone_per_pitchtype_RHP(GAM_RR, RHP_RHH_by_cluster, 3, "RHP vs RHH")
## Warning: Use of `zone_summary$zone_col` is discouraged.
## ℹ Use `zone_col` instead.
## Warning: Use of `zone_summary$zone_col` is discouraged.
## ℹ Use `zone_col` instead.
## Warning: Use of `zone_summary$zone_row` is discouraged.
## ℹ Use `zone_row` instead.
## Use of `zone_summary$zone_row` is discouraged.
## ℹ Use `zone_row` instead.
## Warning in geom_rect(aes(xmin = as.numeric(factor("Middle-In", levels = levels(zone_summary$zone_col))) - : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

plot_zone_per_pitchtype_RHP(GAM_RR, RHP_RHH_by_cluster, 4, "RHP vs RHH")
## Warning: Use of `zone_summary$zone_col` is discouraged.
## ℹ Use `zone_col` instead.
## Warning: Use of `zone_summary$zone_col` is discouraged.
## ℹ Use `zone_col` instead.
## Warning: Use of `zone_summary$zone_row` is discouraged.
## ℹ Use `zone_row` instead.
## Use of `zone_summary$zone_row` is discouraged.
## ℹ Use `zone_row` instead.
## Warning in geom_rect(aes(xmin = as.numeric(factor("Middle-In", levels = levels(zone_summary$zone_col))) - : All aesthetics have length 1, but the data has 25 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.